# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import plotly.express as px
from copy import copy
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import plotly.figure_factory as ff
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tensorflow import keras
# Load the stock price table and the trading volume table from Google Drive
stock_price_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/stocks.csv')
stock_price_df
stock_vol_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/stock_volume.csv')
stock_vol_df
# Sort both tables by Date and rebuild the index afterwards. pandas aligns on
# index labels whenever columns of the two frames are combined, so the labels
# must follow the sorted row order in both frames.
# NOTE(review): sorting is on the raw 'Date' values as read from the CSV --
# confirm the date format sorts chronologically as text.
stock_price_df = stock_price_df.sort_values(by=['Date']).reset_index(drop=True)
stock_price_df
stock_vol_df = stock_vol_df.sort_values(by=['Date']).reset_index(drop=True)
stock_vol_df
# Count the missing values per column in each table
stock_price_df.isnull().sum()
stock_vol_df.isnull().sum()
# Print a summary (columns, non-null counts, dtypes) of each table
stock_price_df.info()
stock_vol_df.info()
# The average trading volume for Apple stock (AAPL) and for the GOOG column:
stock_vol_df['AAPL'].mean()
stock_vol_df['GOOG'].mean()
# The maximum trading volume for sp500:
stock_vol_df['sp500'].max()
# The maximum price of Tesla stock:
stock_price_df['TSLA'].max()
# The average price of the S&P500 over this time period:
stock_price_df['sp500'].mean()
# (Leftover notebook markdown placeholder: "Indented block")
# Function to normalize stock prices based on their initial price
def normalize(df):
    """Return a copy of ``df`` with every value column divided by its first row.

    The first column is left untouched (the callers in this file pass frames
    whose first column is ``Date``). All remaining columns are rescaled so
    that their first row equals 1.

    Args:
        df: DataFrame whose columns after the first are numeric.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    x = df.copy()
    for col in x.columns[1:]:
        # .iloc[0] is the first row by position. A plain [0] would be a label
        # lookup, which picks the wrong row (or raises KeyError) once the
        # frame has been sorted or re-indexed.
        x[col] = x[col] / x[col].iloc[0]
    return x
# Function to plot interactive plots using Plotly Express
def interactive_plot(df, title):
    """Show an interactive Plotly chart with one trace per value column.

    Every column after the first is added as a scatter trace plotted against
    ``df['Date']`` and labelled with the column name.

    Args:
        df: DataFrame with a ``Date`` column; columns after the first are
            the series to draw.
        title: Chart title.
    """
    figure = px.line(title=title)
    series_names = df.columns[1:]
    for series_name in series_names:
        figure.add_scatter(
            x=df['Date'],
            y=df[series_name],
            name=series_name,
        )
    figure.show()
# Price charts: normalized prices, raw prices, then raw prices without the sp500 column
interactive_plot(normalize(stock_price_df), 'Normalized Stock Price Data')
interactive_plot(stock_price_df, 'Non-normalized Stock Prices')
wo_sp500_price = stock_price_df.drop(columns=['sp500'])
interactive_plot(wo_sp500_price, 'Stock Price Data Without SP500')
# Volume charts: raw volumes, raw volumes without the sp500 column, then normalized volumes
interactive_plot(stock_vol_df, 'Stock Volume Data')
wo_sp500_vol = stock_vol_df.drop(columns=['sp500'])
interactive_plot(wo_sp500_vol, 'Stock Volume Data Without SP500')
interactive_plot(normalize(stock_vol_df), 'Normalized Stock Volume Data')
# Function to concatenate the date, stock price, and volume in one dataframe
def individual_stock(price_df, vol_df, name):
    """Combine one stock's date, price and volume into a single DataFrame.

    Args:
        price_df: Price table with a ``Date`` column and a ``name`` column.
        vol_df: Volume table with a ``name`` column.
        name: Column (ticker) to extract from both tables.

    Returns:
        DataFrame with the columns ``Date``, ``Close`` and ``Volume``.
    """
    columns = {
        'Date': price_df['Date'],
        'Close': price_df[name],
        'Volume': vol_df[name],
    }
    return pd.DataFrame(columns)
# Function to return the input/output (target) data for AI/ML Model
# Note that our goal is to predict the future stock price
# Target stock price today will be tomorrow's price
def trading_window(data, n=1):
    """Attach a ``target`` column holding the ``Close`` value ``n`` rows ahead.

    Args:
        data: DataFrame with a ``Close`` column. It is not modified.
        n: Number of rows to look ahead (default 1, i.e. the next row).

    Returns:
        A copy of ``data`` with an extra ``target`` column. The last ``n``
        rows have no future value, so their ``target`` is NaN.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    result = data.copy()
    result['target'] = result['Close'].shift(-n)
    return result
# Try the helper functions on AAPL: build the Date/Close/Volume frame
price_volume_df = individual_stock(stock_price_df, stock_vol_df, 'AAPL')
price_volume_df
# Add the shifted 'target' column
price_volume_target_df = trading_window(price_volume_df)
price_volume_target_df
# The final row has no target value (NaN), so leave it out
price_volume_target_df = price_volume_target_df.iloc[:-1]
price_volume_target_df
# Scale the data into the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0, 1))
# Everything except the Date column is scaled, so Date is dropped first
price_volume_target_scaled_df = sc.fit_transform(price_volume_target_df.drop('Date', axis=1))
price_volume_target_scaled_df
price_volume_target_scaled_df.shape
# Features are the first two scaled columns; the target is the remaining column
X = price_volume_target_scaled_df[:, :2]
y = price_volume_target_scaled_df[:, 2:]
X
y
X.shape, y.shape
# Order matters in a time series, so the data is cut at a fixed row instead of
# using train_test_split, whose default settings shuffle the rows.
# 65% of the rows go to training, the remaining 35% to testing.
split = int(0.65 * X.shape[0])
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
X_train.shape, y_train.shape
X_test.shape, y_test.shape
# Define a data plotting function
def show_plot(data, title):
    """Plot ``data`` on a new 13x5 matplotlib figure with a title and a grid.

    Args:
        data: Values passed straight to ``plt.plot``.
        title: Figure title.
    """
    figure_size = (13, 5)
    plt.figure(figsize=figure_size)
    plt.plot(data, linewidth=3)
    plt.grid()
    plt.title(title)
# Plot the scaled feature matrix of each partition
for _partition, _caption in ((X_train, 'Training Dataset'), (X_test, 'Testing Dataset')):
    show_plot(_partition, _caption)
def scale_split_dateset(df, split = 0.65):
    """Min-max scale a frame and split it into train/test parts in row order.

    The ``Date`` column (if present) is dropped, the remaining columns are
    scaled to the [0, 1] range, the last column is taken as the target and
    all columns before it as features. The rows are then cut at a fixed
    position (no shuffling) and the feature matrices of both parts are
    plotted.

    Args:
        df: DataFrame whose last column is the target, e.g.
            ``Date``/``Close``/``Volume``/``target``.
        split: Fraction of the rows assigned to the training part.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays; the
        ``y`` arrays are 2-D with a single column.
    """
    sc = MinMaxScaler(feature_range=(0, 1))
    # Date is not a numeric feature; drop it when the frame has one.
    scaled = sc.fit_transform(df.drop(columns=['Date'], errors='ignore'))
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling applied to the training rows.
    X = scaled[:, :-1]  # features: everything except the last (target) column
    y = scaled[:, -1:]  # target: last column, kept 2-D
    split_idx = int(split * len(X))
    X_train, y_train = X[:split_idx], y[:split_idx]
    X_test, y_test = X[split_idx:], y[split_idx:]
    show_plot(X_train, 'Training Dataset')
    show_plot(X_test, 'Testing Dataset')
    return X_train, y_train, X_test, y_test
# Build the AAPL dataset with the helper functions
aapl_price_volume_df = individual_stock(stock_price_df, stock_vol_df, 'AAPL')
# The last row of the price/volume/target frame holds a NaN target, so it is dropped
aapl_target_df = trading_window(aapl_price_volume_df)[:-1]
# Scale the frame and split it into the train/test arrays used for training
X_train, y_train, X_test, y_test = scale_split_dateset(aapl_target_df)
shape_report = 'Training shape: {},{} and Testing shape: {},{}'
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
from sklearn.linear_model import Ridge
# Ridge regression performs linear least squares with L2 regularization.
# Create and train the Ridge linear regression model
lr = Ridge()
lr.fit(X_train, y_train)
# Evaluate the model on the held-out rows (score() is the R^2 of the prediction)
lr_accuracy = lr.score(X_test, y_test)
print('My Ridge Regression Score is: {}'.format(lr_accuracy))
# Predict over the full scaled feature matrix (training and testing rows)
predicted_prices = lr.predict(X)
predicted_prices
# Flatten the single-column prediction array into a list
predicted_list = [row[0] for row in predicted_prices]
len(predicted_list)
# Scaled Close value of every row (column 0 of the scaled array)
# NOTE(review): the model is fitted on the shifted 'target' column, while this
# list holds the same-row Close; the values are also min-max scaled, not
# dollar amounts -- confirm this comparison is the intended one.
true_list = [row[0] for row in price_volume_target_scaled_df]
len(true_list)
price_volume_target_df[['Date']]
# Start from the dates of the individual stock data. Copy the selection so the
# new columns are added to an independent frame (avoids chained-assignment
# warnings from pandas).
df_predicted = price_volume_target_df[['Date']].copy()
df_predicted['True $'] = true_list
df_predicted['Predicted $'] = predicted_list
df_predicted
# Plot the results
interactive_plot(df_predicted, 'Ground Truth vs. Predicted Prices')
# Get the individual stock prices and volumes for ticker 'T'
stk_price_volume_df = individual_stock(stock_price_df, stock_vol_df, 'T')
stk_price_volume_df
# Take the Close and Volume columns (everything except Date) as raw input
training_data = stk_price_volume_df.iloc[:, 1:3].values
training_data
# Normalize the data to the [0, 1] range
sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_data)
training_set_scaled
# Build the samples: the feature is the previous row's scaled Close and the
# target is the current row's scaled Close (the scaled Volume is not used).
X = training_set_scaled[:-1, :1]  # shape (rows - 1, 1)
y = training_set_scaled[1:, 0]    # shape (rows - 1,)
X
# Split the data 70/30 for train-test datasets, keeping the row order
split = int(0.7 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]
print('Training shape: {},{} and Testing shape: {},{}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape ))
# Reshape the 2D feature arrays to 3D (samples, timesteps, 1) to feed the LSTM model
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
print('Now my Training shape: {},{} and Testing shape: {},{}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape ))
# Create a basic LSTM model using Keras
inputs = keras.layers.Input(shape=(X_train.shape[1], X_train.shape[2]))
x = keras.layers.LSTM(150, return_sequences=True)(inputs)
x = keras.layers.LSTM(150, return_sequences=True)(x)
x = keras.layers.LSTM(150, return_sequences=True)(x)
# 'linear' activation because the stock price is a continuous value
outputs = keras.layers.Dense(1, activation='linear')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='mse')
model.summary()
# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)
# Predict over all samples, passing the same 3D layout the model was trained on
predicted = model.predict(np.reshape(X, (X.shape[0], X.shape[1], 1)))
predicted
# Flatten the (samples, 1, 1) predictions into a list
test_predicted = [sample[0][0] for sample in predicted]
# Build a dataframe with the dates and the SCALED Close and predicted prices.
# The dates come from the stock being modelled; the first row is skipped
# because it has no previous-row feature. Copy the selection so columns can
# be added without pandas chained-assignment warnings.
df_predicted = stk_price_volume_df[1:][['Date']].copy()
df_predicted
training_set_scaled[0]
# Add the SCALED closing price to this dataframe
close = [row[0] for row in training_set_scaled]
df_predicted['Close'] = close[1:]
df_predicted
# Add the predictions to this dataframe
df_predicted['Predictions'] = test_predicted
df_predicted
# Show the interactive plot of the true and predicted prices
interactive_plot(df_predicted, 'Original Price vs. LSTM Model Predictions')
def make_training_data(stk_price_volume_df, split_ratio = 0.80):
    """Build previous-row Close features and Close targets, split in row order.

    The second and third columns of the frame are min-max scaled to [0, 1].
    Only the first scaled column is used for the samples: the feature of a
    sample is the previous row's value and the target is the current row's
    value. The second scaled column is returned as part of
    ``training_set_scaled`` but is not used as a feature.

    Args:
        stk_price_volume_df: DataFrame whose second and third columns are
            numeric (``Close`` and ``Volume`` for frames built by
            ``individual_stock``).
        split_ratio: Fraction of the samples assigned to the training part.

    Returns:
        Tuple ``(X, X_train, y_train, X_test, y_test, training_set_scaled)``.
        ``X`` is 2-D ``(samples, 1)``, ``X_train``/``X_test`` are 3-D
        ``(samples, 1, 1)`` for the LSTM, the ``y`` arrays are 1-D, and
        ``training_set_scaled`` is the full scaled 2-column array.
    """
    # Drop the first column (Date) and keep the next two as raw input
    training_data = stk_price_volume_df.iloc[:, 1:3].values
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling applied to the training rows.
    sc = MinMaxScaler(feature_range=(0, 1))
    training_set_scaled = sc.fit_transform(training_data)
    scaled_close = training_set_scaled[:, 0]
    # Feature: value one row in the past; target: value of the current row.
    # Slicing keeps X 2-D even when there are no samples.
    X = scaled_close[:-1].reshape(-1, 1)
    y = scaled_close[1:].copy()
    # Cut at a fixed position so the row order is preserved
    split_idx = int(split_ratio * len(X))
    X_train, y_train = X[:split_idx], y[:split_idx]
    X_test, y_test = X[split_idx:], y[split_idx:]
    # Add the trailing feature axis expected by the LSTM input
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    return X, X_train, y_train, X_test, y_test, training_set_scaled
# Create LSTM model using Dropout
def build_model(X_train, num_node=100, drop_out=0.2):
    """Build and compile a three-layer LSTM regressor with dropout.

    The network is LSTM -> Dropout -> LSTM -> Dropout -> LSTM -> Dense(1),
    compiled with the Adam optimizer and mean-squared-error loss. The model
    summary is printed before the model is returned.

    NOTE(review): every LSTM layer, including the last one, uses
    ``return_sequences=True``, so the Dense layer is applied per time step
    and the output keeps a time axis -- confirm this is intended.

    Args:
        X_train: 3-D array; its second and third dimensions define the
            input shape of the model.
        num_node: Number of units in each LSTM layer.
        drop_out: Dropout rate applied after the first two LSTM layers.

    Returns:
        The compiled ``keras.Model``.
    """
    timesteps, n_features = X_train.shape[1], X_train.shape[2]
    inputs = keras.layers.Input(shape=(timesteps, n_features))
    hidden = keras.layers.LSTM(num_node, return_sequences=True)(inputs)
    hidden = keras.layers.Dropout(drop_out)(hidden)
    hidden = keras.layers.LSTM(num_node, return_sequences=True)(hidden)
    hidden = keras.layers.Dropout(drop_out)(hidden)
    hidden = keras.layers.LSTM(num_node, return_sequences=True)(hidden)
    # Linear activation: the predicted price is a continuous value
    outputs = keras.layers.Dense(1, activation='linear')(hidden)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')
    model.summary()
    return model
def make_predictions(model, X, stock):
    """Run ``model.predict`` on ``X`` and flatten the result into a list.

    Args:
        model: Object with a ``predict`` method; each element of its result
            is indexed as ``[0][0]``.
        X: Input passed unchanged to ``model.predict``.
        stock: Not used by this function.

    Returns:
        List with the ``[0][0]`` entry of every prediction.
    """
    raw_predictions = model.predict(X)
    return [sample[0][0] for sample in raw_predictions]
def show_true_and_predicted(stock):
    """Print and plot the scaled Close values next to the model predictions.

    This function reads three module-level variables that must be set before
    it is called: ``price_volume_df`` (source of the ``Date`` column),
    ``training_set_scaled`` (scaled array whose first column is used as the
    Close value) and ``test_predicted`` (list of predictions).

    Args:
        stock: Name used in the printed header and in the chart title.
    """
    # Dates from the second row onwards; copy the selection so columns can be
    # added without pandas chained-assignment warnings.
    df_predicted = price_volume_df[1:][['Date']].copy()
    # SCALED closing price, skipping the first row to line up with the dates
    df_predicted['Close'] = [row[0] for row in training_set_scaled[1:]]
    df_predicted['Predictions'] = test_predicted
    print('')
    print('The final Original & Predicted dataframe for stock {}:'.format(stock))
    print(df_predicted)
    # Show the interactive plot of the true and predicted prices
    interactive_plot(df_predicted, 'Original {} Price vs. LSTM Model Predictions'.format(stock))
stock_price_df
stock_vol_df
# Train one LSTM per column of the price table, starting at the seventh column
for ticker in stock_price_df.columns[6:]:
    print('')
    print('==============Making Predictions for stock {} ================'.format(str(ticker)))
    # Date, close and volume of the current stock
    stk_price_volume_df = individual_stock(stock_price_df, stock_vol_df, ticker)
    (X, X_train, y_train,
     X_test, y_test, training_set_scaled) = make_training_data(stk_price_volume_df, split_ratio=0.70)
    shape_message = 'My Training shape: {},{} and Testing shape: {},{}'
    print(shape_message.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    model = build_model(X_train, num_node=300, drop_out=0.3)
    # Train the model
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    # training_set_scaled and test_predicted are read from module scope by
    # show_true_and_predicted, so they must keep these names
    test_predicted = make_predictions(model, X, ticker)
    show_true_and_predicted(ticker)